# ================================================================================================ #
# ERGM workshop: Why dyadic regression fails or doesn't fail
# Benjamin Rosche
# SDL Workshop - April 2023
# ================================================================================================ #

library(statnet)
library(dplyr)
library(GGally)

# ================================================================================================ #
# Create network
# ================================================================================================ #

# Simulate a network of n nodes in which males send out and receive fewer edges
# (negative nodefactor coefficient) and sex homophily is absent (nodematch
# coefficient of 0).
n <- 100
dat.attrs <- data.frame(
  id = 1:n,
  male = rep(c("0", "1"), times = n / 2)
)

# Empty network carrying the sex attribute on its vertices
g0 <- network.initialize(n)
g0 %v% "male" <- dat.attrs$male

g <- simulate_formula(
  g0 ~ edges + nodefactor("male") + nodematch("male"),
  coef = c(-2, -2, 0),
  seed = 4
)

# This is what the simulated network looks like (nodes colored by sex)
ggnet2(
  g,
  color = "male",
  palette = "Set1",
  edge.color = "black",
  size = 5,
  edge.size = 1
)

# ================================================================================================ #
# ERGM as benchmark
# ================================================================================================ #

# Fit an ERGM with sender (nodeofactor), receiver (nodeifactor), and homophily
# (nodematch) terms for sex, and print its summary.
summary(
  ergm(g ~ edges + nodeofactor("male") + nodeifactor("male") + nodematch("male"))
)

# ================================================================================================ #
# Dyadic regression using logistic regression
# ================================================================================================ #

# Create edgelist data: one row per ordered pair of distinct nodes
# (n * (n - 1) rows) with the columns
#   id_from, id_to  sender and receiver ids
#   tie             1 if the edge id_from -> id_to is present in g, 0 otherwise
#   male_from       sex of the sender
#   male_to         sex of the receiver
#   samesex         TRUE if sender and receiver have the same sex
dat.edgelist <-
  expand.grid(id_from = 1:n, id_to = 1:n) %>%
  filter(id_from != id_to) %>%
  mutate(tie = 0) %>%
  # Set tie to 1 for every dyad that appears in the network's edgelist
  rows_update(
    g %>%
      as.edgelist(output = "tibble") %>%
      rename(id_from = 1, id_to = 2) %>%
      mutate(tie = 1),
    by = c("id_from", "id_to")
  ) %>%
  # Attach sender and receiver attributes. The columns are renamed while
  # selecting so that the join keys are id_from / id_to (this replaces the
  # SAS() helper, which is not provided by any of the loaded packages).
  left_join(
    dat.attrs %>% select(id_from = id, male_from = male),
    by = "id_from"
  ) %>%
  left_join(
    dat.attrs %>% select(id_to = id, male_to = male),
    by = "id_to"
  ) %>%
  mutate(samesex = male_from == male_to) %>%
  arrange(id_from, id_to)

# Inspect the first rows and the dimensions of the dyadic dataset
dat.edgelist %>% head()
dat.edgelist %>% dim()

# Dyadic regression with the homophily term only.
# Here sex homophily is identified as significant (both coefficient and SE are
# biased).
summary(glm(tie ~ samesex, family = "binomial", data = dat.edgelist))

# Dyadic regression following the true data generation process.
# With the sender and receiver effects in the model, the estimates are correct.
summary(
  glm(tie ~ male_from + male_to + samesex, family = "binomial", data = dat.edgelist)
)

# ================================================================================================ #
# Discussion
# ================================================================================================ #

# This example shows that ERGMs reduce to logistic regression if no endogenous network
# mechanisms are modeled. Even so, using glm() rather than ergm() has two disadvantages:
# 1. To model network formation, we need to create an edgelist of all possible ties. With 100 nodes, 
#    we have to create 9900 rows for all possible directed ties. We quickly end up with very large 
#    dataframes.
# 2. With logistic regression, it is difficult/impossible to control for endogenous network 
#    mechanisms. While differences in sociality and popularity (as shown here) are easy to implement, 
#    endogenous terms, such as triadic closure, are impossible to add to a glm() model. 
#    ERGMs allow you to implement such terms.

# eof
